---
title: "Visualization Notebook"
output: html_notebook
---

This notebook provides the code for the plots in the paper. Note that the code
for the PR curve for the random forest baseline is located in
`../prediction/random_forest.Rmd`.

```{r}
library(tidyverse)
library(ggpubr)
source("../prediction/heuristic_language_prediction.R")
```

PR/ROC Curve showing the language-matching algorithm's performance using L2 data
```{r}
# This is calculated in threshold_calibration.Rmd.
threshold_precision_volume <- read_rds(
  "../data/threshold_precision_volume_l2.rds"
) %>%
  mutate(
    recall_l2 = tp_l2 / (tp_l2 + fn_l2),
    fpr_l2 = fp_l2 / (fp_l2 + tn_l2)
  )

auc <- integrate(
  approxfun(
    threshold_precision_volume$fpr_l2,
    threshold_precision_volume$recall_l2,
    yleft = 0,
    yright = 1
  ),
  0,
  1
)
aucpr <- integrate(
  approxfun(
    threshold_precision_volume$recall_l2,
    threshold_precision_volume$precision_l2,
    yleft = 1,
    yright = 0),
  0,
  1
)
p1 <- ggplot(threshold_precision_volume %>% rename(Threshold = threshold)) +
    geom_path(aes(fpr_l2, recall_l2, colour = Threshold)) +
    xlab("False positive rate") +
    ylab("True positive rate") +
    ggtitle("Receiver operating characteristic curve") +
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, .25)) +
    annotate(
      "text",
      x = .5,
      y = .25,
      label = paste0("Area under the curve: ", round(auc$value, 2))
    )
p2 <- ggplot(threshold_precision_volume %>% rename(Threshold = threshold)) +
    geom_path(aes(recall_l2, precision_l2, colour = Threshold)) +
    xlab("Recall") +
    ylab("Precision") +
    ggtitle("Precision-recall curve") +
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, .25)) +
    annotate(
      "text",
      x = .5,
      y = .25,
      label = paste0("Area under the curve: ", round(aucpr$value, 2))
    )

figs <- ggarrange(p1, p2, ncol = 2, common.legend = TRUE, legend = "bottom")

annotate_figure(
  figs,
  top = text_grob("Heuristic model performance")
)
```

"Heat map" displaying the distribution of Spanish speakers across features and cutoffs
```{r}
binned_train_df <- read_rds("../data/binned_train_df_l2.rds")
threshold <- read_rds("../data/threshold.rds")
cutoffs <- set_cutoffs(threshold, 10, binned_train_df, TRUE) %>%
  rename(
    `Age Sextile` = age_bin,
    `Address Score Septile` = addr_score_bin,
    `Count` = bin_count,
    `First Name Score Sextile` = first_name_score_bin,
  ) %>%
  mutate(
    `Last Name Score` = ifelse(
      last_name_score_bin == "(0.484,1]",
      "Highest",
      "Lowest"
    ),
    Flagged = ifelse(cutoff == "In", "Yes", "No")
  )
write_rds(cutoffs, "../data/cutoffs.rds")
```

```{r}
first_name_score_labels <- c(
  `[0,0.0538]` = "Lowest",
  `(0.0538,0.201]` = "",
  `(0.201,0.441]` = "",
  `(0.441,0.667]` = "",
  `(0.667,0.835]` = "",
  `(0.835,1]` = "Highest"
)

p <- ggplot(
  cutoffs,
  aes(
    x = `Age Sextile`,
    y = `Address Score Septile`,
    fill = perc_spanish * 100,
    size = Count,
    colour = Flagged
  )
) +
  geom_tile(size = 1, alpha = 0, aes(fill = Flagged)) +
  geom_point(shape = 21, color = "black") +
  scale_color_manual(values = c("#d9d9d9", "#000000")) +
  xlab("Age sextile") +
  ylab("Address score septile") +
  labs(fill = "% Spanish") +
  scale_fill_distiller(palette = "RdYlBu", direction = 1) +
  scale_x_discrete(
    guide = guide_axis(angle = 45),
    breaks = c("(13.3,24.8]", "(71.6,102]"),
    labels = c("Youngest", "Oldest")
  ) +
  scale_y_discrete(
    breaks = c("(0.0208,0.0565]", "(0.295,0.429]"),
    labels = c("Lowest", "Highest")
  ) +
  theme(
    legend.position = "bottom",
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(color = "black"),
    plot.title = element_text(vjust = 3)
  ) +
  ggtitle("Distribution of Spanish speakers") +
  scale_alpha_discrete(range = c(1, 0.15)) +
  facet_grid(
    `Last Name Score` ~ `First Name Score Sextile`,
    labeller = labeller(
      `First Name Score Sextile` = as_labeller(first_name_score_labels))
    )
```

```{r, fig.align = "center"}
p <- ggplot(
    cutoffs,
    aes(
        x = `Age Sextile`,
        y = `Address Score Septile`
    )
) +
    geom_tile(aes(size = 0, fill = Flagged), linetype = "blank") +
    geom_point(aes(size = Count, colour = perc_spanish * 100)) +
    geom_point(aes(size = Count), shape = 1, colour = "black", alpha = 0.5) +
    xlab("Age sextile") +
    ylab("Address score septile") +
    labs(colour = "% Spanish", size = "Count") +
    scale_color_distiller(palette = "RdYlBu", direction = 1) +
    scale_fill_manual(values = c("#d9d9d9", "#bababa")) +
    scale_x_discrete(
        guide = guide_axis(angle = 45),
        breaks = c("(13.3,24.8]", "(71.6,102]"),
        labels = c("Youngest", "Oldest")
    ) +
    scale_y_discrete(
        breaks = c("(0.0208,0.0565]", "(0.295,0.429]"),
        labels = c("Lowest", "Highest")
    ) +
    theme(
        legend.position = "bottom",
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(color = "black"),
        plot.title = element_text(vjust = 3)
    ) +
    ggtitle("Distribution of Spanish speakers") +
    scale_alpha_discrete(range = c(1, 0.15)) +
    facet_grid(
        `Last Name Score` ~ `First Name Score Sextile`,
        labeller = labeller(
          `First Name Score Sextile` = as_labeller(first_name_score_labels))
        )

p +
  theme(plot.margin = unit(c(1, 0.7, 0.7, 0.7), "cm")) +
  guides(fill = FALSE) +
  guides(size = guide_legend(override.aes = list(fill = NA)))

grid::grid.text(
  unit(0.976, "npc"),
  0.58,
  label = "Last name score half",
  rot = 270,
  gp = gpar(fontsize = "11")
)

grid::grid.text(
  unit(0.51, "npc"),
  unit(.9, "npc"),
  label = "First name score sextile",
  rot = 0,
  gp = gpar(fontsize = "11")
)

p2 <- grid.grab()

grid.newpage()
grid.draw(p2)
```

Plot showing histograms of the binning distribution across the features
```{r}
age <- binned_train_df %>%
  group_by(age_bin) %>%
  summarize(
    perc_spanish = mean(is_spanish),
    Count = n()
  ) %>%
  arrange(age_bin) %>%
  ggplot(aes(x = age_bin, y = perc_spanish, size = Count, group = 1)) +
  geom_point() +
  geom_line(linetype = "dashed", size = 0.4, colour = "black") +
  xlab("Age sextile") +
  theme(
    axis.title.y = element_blank()
  ) + scale_x_discrete(
    breaks = c("(13.3,24.8]", "(71.6,102]"),
    labels = c("Youngest", "Oldest")
  )

addr_score <- binned_train_df %>%
  group_by(addr_score_bin) %>%
  summarize(
    perc_spanish = mean(is_spanish),
    Count = n()
  ) %>%
  arrange(addr_score_bin) %>%
  ggplot(aes(x = addr_score_bin, y = perc_spanish, size = Count, group = 1)) +
  geom_point() +
  geom_line(linetype = "dashed", size = 0.4, colour = "black") +
  xlab("Address score septile") +
  theme(
    axis.title.y = element_blank()
  ) + scale_x_discrete(
    breaks = c("(0.0208,0.0565]", "(0.295,0.429]"),
    labels = c("Lowest", "Highest")
  )

first_name_score <- binned_train_df %>%
  group_by(first_name_score_bin) %>%
  summarize(
    perc_spanish = mean(is_spanish),
    Count = n()
  ) %>%
  arrange(first_name_score_bin) %>%
  ggplot(
    aes(x = first_name_score_bin, y = perc_spanish, size = Count, group = 1)
  ) +
  geom_point() +
  geom_line(linetype = "dashed", size = 0.4, colour = "black") +
  xlab("First name score sextile") +
  theme(
    axis.title.y = element_blank()
  ) + scale_x_discrete(
    breaks = c("(0.0538,0.201]", "(0.835,1]"),
    labels = c("Lowest", "Highest")
  )

last_name_score <- binned_train_df %>%
  group_by(last_name_score_bin) %>%
  summarize(
    perc_spanish = mean(is_spanish),
    Count = n()
  ) %>%
  arrange(last_name_score_bin) %>%
  ggplot(
    aes(x = last_name_score_bin, y = perc_spanish, size = Count, group = 1)
  ) +
  geom_point() +
  geom_line(linetype = "dashed", size = 0.4, colour = "black") +
  xlab("Last name score half") +
  theme(
    axis.title.y = element_blank()
  ) + scale_x_discrete(
    breaks = c("[0,0.484]", "(0.484,1]"),
    labels = c("Lowest", "Highest")
  )

full <- ggarrange(
  age,
  addr_score,
  first_name_score,
  last_name_score,
  common.legend = TRUE,
  ncol = 4,
  nrow = 1,
  legend = "bottom"
)

annotate_figure(
  full,
  top = text_grob("Proportion of Spanish speakers by attribute"),
  left = text_grob("Proportion Spanish speakers", rot = 90)
)
```

Plot demonstrating the elbow method for determining the optimal number
of bins for each feature
```{r}
df <- readRDS("../data/cv_bins_l2.rds")
features <- c(
  "age",
  "spanish_address_score",
  "spanish_first_name_score",
  "spanish_surname_score"
)

age_bins <- df %>%
  filter(feature == "age") %>%
  group_by(num_bins) %>%
  summarize(
    rmse = mean(res)
  ) %>%
  ggplot(aes(x = num_bins, y = rmse)) +
  geom_point() +
  geom_line() +
  xlab("Number of bins (k)") +
  ylab("Root mean square error (RMSE)") +
  ggtitle("Age") +
  geom_vline(xintercept = 6, linetype = "dashed") +
  theme(plot.title = element_text(size = 12)) +
  annotate("text", x = 24, y = 15, label = "Elbow at k = 6\nRMSE = 3.53")

address_score_bins <- df %>%
  filter(feature == "spanish_address_score") %>%
  group_by(num_bins) %>%
  summarize(
    rmse = mean(res)
  ) %>%
  ggplot(aes(x = num_bins, y = rmse)) +
  geom_point() +
  geom_line() +
  xlab("Number of bins (k)") +
  ylab("Root mean square error (RMSE)") +
  ggtitle("Address score") +
  geom_vline(xintercept = 7, linetype = "dashed") +
  theme(plot.title = element_text(size = 12)) +
  annotate("text", x = 24, y = .047, label = "Elbow at k = 7\nRMSE = 0.01")

first_name_score_bins <- df %>%
  filter(feature == "spanish_first_name_score") %>%
  group_by(num_bins) %>%
  summarize(
    rmse = mean(res)
  ) %>%
  ggplot(aes(x = num_bins, y = rmse)) +
  geom_point() +
  geom_line() +
  xlab("Number of bins (k)") +
  ylab("Root mean square error (RMSE)") +
  ggtitle("First name score") +
  geom_vline(xintercept = 6, linetype = "dashed") +
  theme(plot.title = element_text(size = 12)) +
  annotate("text", x = 24, y = .25, label = "Elbow at k = 6\nRMSE = 0.03")

last_name_score_bins <- df %>%
  filter(feature == "spanish_surname_score") %>%
  group_by(num_bins) %>%
  summarize(
    rmse = mean(res)
  ) %>%
  ggplot(aes(x = num_bins, y = rmse)) +
  geom_point() +
  geom_line() +
  xlab("Number of bins (k)") +
  ylab("Root mean square error (RMSE)") +
  ggtitle("Last name score") +
  geom_vline(xintercept = 2, linetype = "dashed") +
  theme(plot.title = element_text(size = 12)) +
  annotate("text", x = 24, y = .34, label = "Elbow at k = 2\nRMSE = 0.06")

all_elbow_method <- ggarrange(
  age_bins,
  address_score_bins,
  first_name_score_bins,
  last_name_score_bins,
  common.legend = TRUE,
  ncol = 2,
  nrow = 2
)

annotate_figure(
  all_elbow_method,
  top = text_grob("Elbow method for finding optimal number of bins")
)
```

